Give basic insights into the numeric variable you picked as the output variable, using one categorical variable you selected, via the group_by and summarize functions.

library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the full anime dataset and keep only the six genres of interest.
animeall <- read.csv('data/anime.csv')
selected_genres <- c('Action', 'Drama', 'Space', 'Comedy', 'Supernatural', 'Fantasy')
anime <- animeall[animeall$genre %in% selected_genres, ]
rmarkdown::paged_table(anime)
# Overall range (max / min) of the output variable `score`.
# Fix: the original called na.omit(cols = 'score'); na.omit.data.frame
# ignores a `cols` argument and silently drops rows with an NA in ANY
# column. Selecting `score` first and then dropping its NAs keeps exactly
# the rows where `score` itself is observed, which was the stated intent.
anime %>%
  select(score) %>%
  drop_na() %>%
  summarize(score_max = max(score),
            score_min = min(score))
# Median / max / min score within each selected genre.
# Fix: na.omit(cols = 'score') is a no-op on `cols` -- na.omit drops rows
# with an NA in ANY column. drop_na(score) removes only the rows whose
# score is missing, matching the original intent.
anime %>%
  select(score, genre) %>%
  drop_na(score) %>%
  group_by(genre) %>%
  summarise(score_median = median(score),
            score_max = max(score),
            score_min = min(score))
Visualize the variables you selected.
# Distribution of scores; the default of 30 bins is kept deliberately.
ggplot(anime) +
  geom_histogram(aes(x = score), colour = 'white')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 36 rows containing non-finite values (stat_bin).
# One horizontal boxplot of score per genre, coloured by genre.
ggplot(anime, aes(y = genre, x = score, colour = genre)) +
  geom_boxplot()
## Warning: Removed 36 rows containing non-finite values (stat_boxplot).
# Restrict to widely-rated titles (more than 1000 raters) before plotting
# rank against score; subset() drops NA rows exactly as filter() does.
anime.small <- subset(anime, scored_by > 1000)
ggplot(anime.small, aes(x = rank, y = score, colour = genre)) +
  geom_point()
Using the full dataset, fit a regression:
# Simple linear regression of score on rank over the full dataset;
# printing the fit shows the two estimated coefficients.
fit1 <- lm(score ~ rank, data = animeall)
fit1
##
## Call:
## lm(formula = score ~ rank, data = animeall)
##
## Coefficients:
## (Intercept) rank
## 7.98340 -0.00024
# Full summary: coefficient table, residual standard error, R-squared.
summary(fit1)
##
## Call:
## lm(formula = score ~ rank, data = animeall)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2733 -0.1074 -0.0920 -0.0076 5.2672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.983e+00 2.558e-03 3121.2 <2e-16 ***
## rank -2.400e-04 4.409e-07 -544.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4439 on 77735 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.7922, Adjusted R-squared: 0.7922
## F-statistic: 2.963e+05 on 1 and 77735 DF, p-value: < 2.2e-16
# Scatter of score against rank over all observations, coloured by genre.
ggplot(animeall, aes(x = rank, y = score, colour = genre)) +
  geom_point()
## Warning: Removed 174 rows containing missing values (geom_point).
library('ggExtra')
# Log-log scatter of popularity vs. rank with marginal histograms on both
# axes; the legend is suppressed because genre has many levels.
g <- ggplot(animeall, aes(y=log(popularity), x=log(rank), colour=genre)) +
geom_point(alpha=.3) +
theme(legend.position = 'none')
ggMarginal(g, type='histogram')
Using all your input variables, fit a multiple linear regression model:
# Multiple regression: popularity on four numeric predictors.
fit2 <- lm(popularity ~ rank + members + episodes + favorites, data = animeall)
summary(fit2)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.159391e+03 1.710484e+01 67.781465 0.000000e+00
## rank 8.636827e-01 2.524476e-03 342.123531 0.000000e+00
## members -9.417189e-03 9.057338e-05 -103.973032 0.000000e+00
## episodes 1.500524e+00 1.859103e-01 8.071229 7.058962e-16
## favorites 1.653038e-01 2.241363e-03 73.751461 0.000000e+00
sigma(fit1) # residual standard error of fit1 (on the score scale)
## [1] 0.4438741
sigma(fit2) # residual standard error of fit2 (on the popularity scale)
## [1] 2231.768
# Replace `favorites` with the categorical `type`; lm dummy-codes it,
# using the first level as the reference category.
fit3 <- lm(popularity ~ rank + members + episodes + type, data = animeall)
summary(fit3)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.000480e+03 2.282970e+01 87.6262080 0.000000e+00
## rank 8.354908e-01 2.491087e-03 335.3921325 0.000000e+00
## members -2.793146e-03 5.310851e-05 -52.5931942 0.000000e+00
## episodes 7.948373e+00 1.820952e-01 43.6495582 0.000000e+00
## typeMusic 1.892189e+03 5.534285e+01 34.1903143 2.756464e-254
## typeONA -4.324359e+01 4.229526e+01 -1.0224217 3.065846e-01
## typeOVA -5.556330e+02 2.830127e+01 -19.6327908 1.316164e-85
## typeSpecial -3.131052e+02 3.156313e+01 -9.9199657 3.519877e-23
## typeTV -1.996209e+03 2.293803e+01 -87.0261701 0.000000e+00
## typeUnknown -1.209083e+03 1.508551e+03 -0.8014865 4.228525e-01
# Add a rank:type interaction on top of fit3's predictors.
fit4 <- lm(popularity ~ rank + members + episodes + type + rank:type, data = animeall)
# Fix: the original line printed summary(fit3) again (copy-paste slip --
# its output was byte-identical to the fit3 table above), so the
# interaction coefficients were never shown. Report fit4 instead.
summary(fit4)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.000480e+03 2.282970e+01 87.6262080 0.000000e+00
## rank 8.354908e-01 2.491087e-03 335.3921325 0.000000e+00
## members -2.793146e-03 5.310851e-05 -52.5931942 0.000000e+00
## episodes 7.948373e+00 1.820952e-01 43.6495582 0.000000e+00
## typeMusic 1.892189e+03 5.534285e+01 34.1903143 2.756464e-254
## typeONA -4.324359e+01 4.229526e+01 -1.0224217 3.065846e-01
## typeOVA -5.556330e+02 2.830127e+01 -19.6327908 1.316164e-85
## typeSpecial -3.131052e+02 3.156313e+01 -9.9199657 3.519877e-23
## typeTV -1.996209e+03 2.293803e+01 -87.0261701 0.000000e+00
## typeUnknown -1.209083e+03 1.508551e+03 -0.8014865 4.228525e-01
sigma(fit3) # residual standard error of fit3
## [1] 2133.072
sigma(fit4) # residual standard error of fit4 (smallest of the four fits)
## [1] 2104.133
In this section, you will do the same as you did in 1.3, but this time you will first split the data into train and test sets.
# Reproducible 80/20 train/test split of the full dataset.
# (The original first line had prose fused into the code:
# "set.seed(...).set.seed(156)" is not valid R.)
set.seed(156) # fixed seed so the same random split is drawn on every run
train_size <- floor(0.8 * nrow(animeall))
# seq_len() instead of 1:nrow() -- safe even for an empty data frame.
train_inds <- sample(seq_len(nrow(animeall)), size = train_size)
test_inds <- setdiff(seq_len(nrow(animeall)), train_inds)
train <- animeall[train_inds, ]
test <- animeall[test_inds, ]
cat('train size:', nrow(train), '\ntest size:', nrow(test))
## train size: 62328
## test size: 15583
library('caret')
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Refit the four candidate models on the training partition only.
fit1 <- lm(popularity ~ rank, data = train)
fit2 <- lm(popularity ~ rank + members + episodes + favorites, data = train)
fit3 <- lm(popularity ~ rank + members + episodes + type, data = train)
fit4 <- lm(popularity ~ rank + members + episodes + type + rank:type, data = train)
# Out-of-sample predictions on the held-out test rows.
pred1 <- predict(fit1, newdata=test)
pred2 <- predict(fit2, newdata=test)
pred3 <- predict(fit3, newdata=test)
pred4 <- predict(fit4, newdata=test)
## Warning in predict.lm(fit4, newdata = test): prediction from a rank-deficient
## fit may be misleading
# Test-set RMSE for each model. Fix: na.rm = TRUE is required because test
# rows with missing predictor values yield NA predictions, and caret::RMSE
# propagates NA by default -- which is why the original run printed NA for
# models 2-4 (see the output below).
rmse1 <- RMSE(pred1, test$popularity, na.rm = TRUE)
rmse2 <- RMSE(pred2, test$popularity, na.rm = TRUE)
rmse3 <- RMSE(pred3, test$popularity, na.rm = TRUE)
rmse4 <- RMSE(pred4, test$popularity, na.rm = TRUE)
rmses <- c(rmse1, rmse2, rmse3, rmse4)
rmses
## [1] 2389.85 NA NA NA
In case you want to include a picture (screenshot) of your notebook (tablet), you can use the lines below to embed the image in the output PDF file:
library('knitr')
# Toy height/weight dataset rendered as a small table with kable.
X <- c(185, 175, 170) # heights (cm)
Y <- c(105, 73, 66)   # weights (kg)
result <- matrix(
  c(X, Y),
  nrow = 3,
  dimnames = list(c("", "", ""), c("Height (cm)", "Weight (kg)"))
)
kable(result, caption = 'Simple Dataset')
| | Height (cm) | Weight (kg) |
|---|---|---|
| | 185 | 105 |
| | 175 | 73 |
| | 170 | 66 |
#knitr::include_graphics('GradientDescentAlgorithm.jpg')
Hypothesis function \(h_{\theta}(x) = {\theta_0} +{\theta_1}x\)
Let \({\theta_0}\) = 0
\({\theta_1}\) = 0
\(\alpha\) = 0.1
You will use horsepower as input variable and miles per gallon (mpg) as output:
mpg (\(Y\)) and horsepower (\(X\)).
log(mpg) and log(horsepower).
The code below estimates the coefficients of linear regression using gradient descent algorithm. If you are given a single linear regression model;
\[Y = \beta_0 + \beta_1 X \]
where \(Y=[Y_1,\dots,Y_N]^T\) and \(X=[X_1,\dots,X_N]^T\) are output and input vectors containing the observations.
The algorithm estimates the parameter vector \(\theta = [\beta_0,\beta_1]\) by starting with an arbitrary \(\theta_0\) and adjusting it with the gradient of the loss function as:
\[\theta := \theta + \frac{\alpha}{N} X^T(Y - X\theta)\]
where \(\alpha\) is the step size (or learning rate) and \(-\frac{1}{N}X^T(Y - X\theta)\) is the gradient of the loss. At each step the algorithm calculates the gradient of the loss and adjusts the parameter set accordingly.
Set epsilon = 1e-6 and alpha = 0.05, then run the code.
alpha=0.01
alpha=0.05 and try theta0=c(1,1) vs. theta0=c(1,-1):
epsilon = 1e-8 and try alpha=0.01, alpha=0.05 and alpha=0.1.